# Import the packages
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
import pandas as pd
import datetime as dt
# Read the raw shooting records from the CSV export
shoot = pd.read_csv("C:/Users/alber/OneDrive/Desktop/MUSA620/HW2/shootings.csv")
shoot.head(n=20)

# Drop the columns that are not carried into the working table
unwanted = [
    'the_geom', 'lng', 'year', 'dc_key', 'code', 'the_geom_webmercator',
    'offender_deceased', 'dist', 'time', 'inside', 'outside', 'lat',
    'offender_injured',
]
shooting = shoot.drop(columns=unwanted)
shooting

# Parse the date strings into datetimes, then split out month, year and day
shooting['date_'] = pd.to_datetime(shooting['date_'])
date_parts = shooting['date_'].dt
shooting['Month'] = date_parts.month
shooting['Year'] = date_parts.year
shooting['Day'] = date_parts.day
shooting.head(n=20)

# Inspect which years are present
shooting['Year'].unique()

# Keep only the rows whose race code is W, B, A or I (discard anything else)
valid_race = ['W', 'B', 'A', 'I']
selection = shooting['race'].isin(valid_race)
shooting_used = shooting[selection]

# Inspect which race codes remain after filtering
shooting_used['race'].unique()
# Count the records for each race (non-null entries of the 'count' column),
# turn the race index back into a regular column, and give both columns
# presentation-friendly names.
shooting_Race = (
    shooting_used.groupby(['race'])['count']
    .count()
    .reset_index()
    .rename(columns={'race': 'Race', 'count': 'Shooting_Amount'})
)
shooting_Race
# Initialize the figure; the pie is drawn in the left cell of a 1x2 grid
plt.figure(figsize=(16, 8))
ax1 = plt.subplot(121, aspect='equal')

# "Explode" (pull out) only the largest slice. The offsets are derived from
# the data instead of a hard-coded 4-tuple, so the chart still works if the
# number of races changes and always highlights the largest group.
largest_slice = shooting_Race['Shooting_Amount'].idxmax()
explode = [0.1 if row == largest_slice else 0 for row in shooting_Race.index]

# Plot a pie chart of victims per race, labelled with percentages
shooting_Race.plot(
    kind='pie',
    explode=explode,
    y='Shooting_Amount',
    ax=ax1,
    autopct='%1.1f%%',
    startangle=90,
    shadow=False,
    labels=shooting_Race['Race'],
    legend=True,
    fontsize=14,
)
At first, I noticed that the shooting victims are racially diverse. So, I decided to make a pie chart to get a better sense of the extent to which different races were involved and hurt in the shootings. Because Matplotlib is a basic package for data visualization, I have a lot of freedom to customize the graph myself. So, after setting up the basic axis information of the graph, I input the shooting data and divided it by race. After setting the font size, the percentage of victims of each race is shown properly on the graph. Finally, I exploded the chart by separating the largest slice from the other slices.
By observing the chart, I found that: 1) More than 80% of the shooting victims between 2015 and 2019 were Black. 2) There were only a small number of Asian and Indian victims.
# Count the records in every (year, race, sex) combination and flatten the
# grouped index back into ordinary columns
shooting_RaceSex = (
    shooting_used.groupby(['Year', 'race', 'sex'])['count']
    .count()
    .reset_index()
)
shooting_RaceSex

# Build a combined "<race>-<sex>" key for every row
shooting_RaceSex['Race_and_Sex'] = shooting_RaceSex['race'] + '-' + shooting_RaceSex['sex']
print(shooting_RaceSex)

# Give the columns presentation-friendly names
shooting_RaceSex = shooting_RaceSex.rename(
    columns={'race': 'Race', 'sex': 'Sex', 'count': 'Amount'}
)
shooting_RaceSex

# Total number of shootings per year, as a two-column table (Year, Total)
shooting_Number = (
    shooting_RaceSex.groupby(['Year'])['Amount']
    .sum()
    .reset_index(name='Total')
)
shooting_Number

# Attach each year's total to the per-group rows (the only shared column is Year)
Merge = shooting_RaceSex.merge(shooting_Number, on='Year')

# Share of each race-sex group in its year's total, in percent
Merge['Percent'] = Merge['Amount'] / Merge['Total'] * 100
Merge
# Initialize the figure and axes
fig, ax = plt.subplots(figsize=(15, 10))

# One fixed color per race-sex key
color_map = {
    "A-M": "#F2C335",
    "A-F": "#F2D479",
    "B-M": "#000000",
    "B-F": "#6C6F73",
    "W-M": "#D53711",
    "W-F": "#DB6E53",
    "I-M": "#034AA6",
    "I-F": "#79D0F2",
}

# Draw one line per race-sex group
for Race_and_Sex, group in Merge.groupby("Race_and_Sex"):
    print(f"Plotting {Race_and_Sex}...")

    # Plot year vs amount of shootings for this group
    ax.plot(
        group["Year"],
        group["Amount"],
        marker="o",
        label=Race_and_Sex,
        color=color_map[Race_and_Sex],
        # Fully opaque; matplotlib only accepts alpha values between 0 and 1
        alpha=1.0,
    )

# Format the axes
ax.legend(loc="best")
ax.set_xlabel("Shooting Date")
ax.set_ylabel("Amount of Shooting per Year")
ax.set_ylim(-150, 1300)
ax.grid(True)
After learning the general racial composition of the shooting victims, I decided to dig deeper into how the number of shootings changed between 2015 and 2019, not only across races but also across sex groups. So, I made a line chart. As mentioned before, because Matplotlib is a basic package for data visualization, I have a lot of freedom to customize the graph myself. So, after setting up the basic axis information of the graph, I set the colors for each group: yellow for Asians, black for Black people, red for White people, and blue for Indians. In addition, the colors for males were set darker than those for females. By putting the shooting date (year) on the x-axis and the number of shooting victims per year on the y-axis, I got the line chart.
By observing the chart, I found that: 1) The number of Black male victims reached its highest point in 2018 after a short-term decrease from 2016 to 2017. 2) The number of White male victims slightly increased from 2015 to 2017 and declined from 2017 to 2018. 3) There were not many Asian and Indian victims, so their numbers seem to stay stable the whole time. 4) Within each race, the number of male victims is always higher than the number of female victims.
# Import the package
import seaborn as sns
# Create the figure and the axes that will hold the bar chart
fig, ax = plt.subplots(figsize=(10, 8))
# Grouped bars: one cluster per race-sex key, one bar per year, bar height is
# the group's share of that year's shootings
ax = sns.barplot(
    data=Merge,
    x="Race_and_Sex",
    y="Percent",
    hue="Year",
    palette='magma',
    ax=ax,
)
After learning the general trends in the number of shooting victims among the different race-sex groups, I decided to dig deeper into how each group's percentage of the shootings changed between 2015 and 2019, to see whether some specific groups of people became more vulnerable in recent years. So, I made a bar chart using Seaborn. In fact, Seaborn is more advanced than Matplotlib and offers simplified code for data visualization. So, by putting the race-sex groups on the x-axis and the percentages of shooting victims on the y-axis, I got the bar chart. To make the chart look prettier, I set the color palette and made the background a dark color.
By observing the chart, I found that: 1) The percentages of both Black male and Black female victims increased from 2017 to 2018 after decreasing from 2015 to 2017. In addition, the percentage of Black female victims reached its highest point in recent years even though only nine months of data were available. 2) The percentage of White male victims increased from 2015 to 2017 and declined from 2017 to 2018, and that of White females decreased continuously in recent years. 3) There were not many Asian and Indian victims. 4) The percentage of Black male victims is always more than 75% of all victims.
# Count the shootings in every (year, month) combination
shooting_1 = shooting_used.groupby(['Year', 'Month'])['count'].count()
# Reset the index so that Year and Month become regular columns again
shooting_1 = shooting_1.reset_index()
# Rename the columns
shooting_1.columns = ['Year', 'Month', 'Shootings']
shooting_1

sns.set()

# Reshape the long table into a wide Month x Year grid for the heatmap.
# Keyword arguments are required here: pandas 2.x no longer accepts
# positional index/columns/values in DataFrame.pivot.
shooting_2 = shooting_1.pivot(index="Month", columns="Year", values="Shootings")

# Draw a heatmap with the numeric values (number of shootings) in each cell
f, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(shooting_2, annot=True, fmt="g", linewidths=.5, cmap='viridis', ax=ax)
Since Seaborn is more advanced than Matplotlib, it can be used to visualize data in more effective charts rather than only basic plots. Because I wanted to check whether more shootings happened in some specific months, I made a heatmap of the number of shootings for each month and year. To make the chart look prettier, I set the color palette to "viridis" so that cells with larger numbers are lighter and those with smaller numbers are darker.
By observing the chart, I found that: 1) There was always a large number of shootings in summer (June to August) and few in winter, especially in January and February.
# Import the package
import altair as alt
alt.renderers.enable('notebook')
shooting_used
# Lift Altair's cap on the number of rows a chart may embed
alt.data_transformers.disable_max_rows()
# Horizontal bars: one bar per race, bar length is the mean victim age,
# colored by race; pan/zoom is enabled via interactive().
# NOTE(review): the 'Race' field produced by the bin transform is not
# referenced by any encoding in this chart - confirm it is still needed.
alt.Chart(shooting_used).mark_bar().encode(
    alt.X('mean(age):Q'),
    alt.Y('race:O'),
    alt.Color('race'),
).transform_bin(
    as_='Race', field='race'
).interactive()
From the graphs I made before, I already got some sense of the differences in shootings among the different race and sex groups. So, at this point, I wanted to investigate the mean age of the victims in each racial group. While making the chart, I set the mean age as the X axis and race as the Y axis. Finally, I made the chart interactive.
By observing the chart, I found that: 1) The mean age of Asian victims is the highest (higher than 35 years old). 2) The mean age of Black victims is the lowest, which is lower than 30 years old. Most victims are very young!
# Vertical bars: one bar per race, bar height is the number of records,
# with each bar split by color according to the victim's sex.
# NOTE(review): the 'binned_rating' field produced by the bin transform is
# not referenced by any encoding in this chart - confirm it is still needed.
alt.Chart(shooting_used).mark_bar().encode(
    alt.X('race:O'),
    alt.Y('count()'),
    alt.Color('sex'),
).transform_bin(
    as_='binned_rating', field='race'
)
This chart conveys information similar to the line chart and bar chart I made in the previous two sections. However, in this graph, I transformed the binning and showed the different sexes together in the same bar.
By observing the chart, I found that: 1) The majority of the victims are Black. 2) Males are more likely than females to be involved in shootings.
# Two-chart dashboard linked by a brush: a date-vs-age scatterplot on top and
# a per-race count bar chart below that only counts the brushed records.
# Create the interval (drag-to-select) brush shared by both charts
brush = alt.selection(type='interval')
# The top scatterplot: one point per record, date on x and victim age on y.
# Points inside the brush are colored by race; points outside are light gray.
# NOTE(review): the brush is attached to this chart and is also used as the
# domain of this chart's own x scale - confirm that coupling is intended.
upper = alt.Chart().mark_point().encode(
alt.X('date_:T',scale=alt.Scale(domain=brush)),
y='age:Q',
color=alt.condition(brush, 'race:N', alt.value('lightgray'))
).properties(
selection=brush,
width=800
)
# The bottom bar plot: number of records per race, restricted to the records
# currently selected by the brush
lower = alt.Chart().mark_bar().encode(
y='race:N',
color='race:N',
x='count(race):Q'
).transform_filter(
brush.ref()
).properties(
width=800
).interactive()
# Both charts are created without data; it is supplied once here
chart = alt.vconcat(upper, lower, data=shooting_used) # vertical stacking
chart
In this two-chart dashboard, I plotted the daily shooting records from 2015 to 2019 as a scatterplot to check the age and race distributions of the victims. Linking the two charts through the brush selection gave me a better sense of how the number of victims of each race grew over a given period of time.
By observing the chart, I found that: 1) The majority of the victims are Black. 2) The Black victims seem particularly young.
# Two-chart dashboard linked by a brush: a per-year area chart on top and a
# per-race count bar chart below that only counts the brushed records.
# Create the interval (drag-to-select) brush shared by both charts
brush = alt.selection(type='interval')
# The top area plot: sum of the 'count' column per year, colored by race when
# inside the brush and light gray otherwise.
# NOTE(review): the brush is attached to this chart and is also used as the
# domain of both its x and y scales - confirm that coupling is intended.
lines = alt.Chart().mark_area().encode(
alt.Y('sum(count):Q',scale=alt.Scale(domain=brush)),
alt.X('Year:N',scale=alt.Scale(domain=brush)),
color=alt.condition(brush, 'race:N', alt.value('lightgray'))
).properties(
selection=brush,
width=800
)
# The bottom bar plot: number of records per race
bars = alt.Chart().mark_bar().encode(
y='race:N',
color='race:N',
x='count(race):Q'
).transform_filter(
brush.ref() # the filter transform uses the selection
# to filter the input data to this chart
).properties(
width=800
).interactive()
# Both charts are created without data; it is supplied once here
chart = alt.vconcat(lines, bars, data=shooting_used) # vertical stacking
chart
In this two-chart dashboard, I plotted the total shooting records per year for each race as area plots to see the number of victims of each race. Linking the two charts through the brush selection gave me a better sense of how the number of victims of each race grew over a given period of time (in years).
By observing the chart, I found that: 1) The majority of the victims are Black. 2) From 2015 to 2017, the number of White victims increased more than that of any other race, and from 2017 to 2018, the number of Black victims increased the most.
# Small multiples: one 200x200 line chart per (sex, race) pair, laid out with
# sex as rows and race as columns. Each panel plots the yearly sum of the
# 'count' column, colored by race; interactive() enables pan/zoom.
alt.Chart(shooting_used).mark_line().encode(
    alt.X("Year:N"),
    alt.Y("sum(count):Q"),
    alt.Color("race:N"),
).properties(
    width=200,
    height=200,
).facet(row="sex", column="race").interactive()
In the last group of charts made with Altair, I plotted a faceted figure containing 8 charts. Each chart belongs to one race-sex group. After setting the Y axis to the total number of shootings and the X axis to the years (2015 to 2019), I can easily compare these groups (especially groups of the same sex). In addition, by making the graphs interactive, I can change the scale of the Y axis for all 8 charts at the same time, which helps me read the numbers more exactly.
By observing the chart, I found that: 1) The number of Black male victims is far higher than that of the other race-male groups. 2) The numbers of Asian and Indian victims are really small.